R Markdown

# Load the necessary libraries
library(tm)
## Loading required package: NLP
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(syuzhet)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:syuzhet':
## 
##     rescale
library(reshape2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
#Read President George Walker Bush's Inaugural speeches
BOtext<- readLines('C:/Users/skimitei/Desktop/r_code/BO.txt')

# Load the data speech as a corpus
BO <- Corpus(VectorSource(BOtext))
#############Text transformation - Text wrangling in Text Mining#############

#----------------------------------------------------------------------------
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
BO<- tm_map(BO, toSpace, "/")
## Warning in tm_map.SimpleCorpus(BO, toSpace, "/"): transformation drops documents
BO<- tm_map(BO, toSpace, "@")
## Warning in tm_map.SimpleCorpus(BO, toSpace, "@"): transformation drops documents
# Convert the text to lower case
BO <- tm_map(BO, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(BO, content_transformer(tolower)): transformation
## drops documents
# Remove numbers
BO <- tm_map(BO, removeNumbers)
## Warning in tm_map.SimpleCorpus(BO, removeNumbers): transformation drops
## documents
# Remove english common stopwords
BO <- tm_map(BO, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(BO, removeWords, stopwords("english")):
## transformation drops documents
# Remove your own stop word
# specify your stopwords as a character vector
BO<- tm_map(BO, removeWords, c("blabla1", "blabla2")) 
## Warning in tm_map.SimpleCorpus(BO, removeWords, c("blabla1", "blabla2")):
## transformation drops documents
# Remove punctuations
BO <- tm_map(BO, removePunctuation)
## Warning in tm_map.SimpleCorpus(BO, removePunctuation): transformation drops
## documents
# Eliminate extra white spaces
BO <- tm_map(BO, stripWhitespace)
## Warning in tm_map.SimpleCorpus(BO, stripWhitespace): transformation drops
## documents
# Text stemming
# docs <- tm_map(docs, stemDocument)
BO <- tm_map(BO, toSpace, "\\|")
## Warning in tm_map.SimpleCorpus(BO, toSpace, "\\|"): transformation drops
## documents
# Build a term-document matrix of President George Bush Jr's inaugural speeches
# and display the top ten most used words
dtm <- TermDocumentMatrix(BO)
#find associations with words by correlation with the findAssocs() function
findAssocs(dtm, c("country" , "american","nation", "freedom","love"), corlimit=0.60) # specifying a correlation limit of 0.60
## $country
##        make        away commitments        home     illness       lives 
##        0.72        0.63        0.63        0.63        0.63        0.63 
##        loss       lucky      matter    medicaid    medicare   recognize 
##        0.63        0.63        0.63        0.63        0.63        0.63 
##    reserved       risks         sap      social      sudden       swept 
##        0.63        0.63        0.63        0.63        0.63        0.63 
##      takers    terrible 
##        0.63        0.63 
## 
## $american
## ultimately       need     person     single     acting      bring      equip 
##       0.66       0.66       0.66       0.66       0.63       0.63       0.63 
##       ever   fidelity     forces   freedoms       labs       math   militias 
##       0.63       0.63       0.63       0.63       0.63       0.63       0.63 
##    muskets   networks preserving   research  responses   soldiers   teachers 
##       0.63       0.63       0.63       0.63       0.63       0.63       0.63 
##      times <U+0092>ll 
##       0.63       0.63 
## 
## $nation
## numeric(0)
## 
## $freedom
##         king        along        bound      declare      evident        falls 
##         0.63         0.60         0.60         0.60         0.60         0.60 
##   footprints    forebears       guides         hear inextricably         left 
##         0.60         0.60         0.60         0.60         0.60         0.60 
##     preacher        selma       seneca         soul         star    stonewall 
##         0.60         0.60         0.60         0.60         0.60         0.60 
##         sung       unsung         walk 
##         0.60         0.60         0.60 
## 
## $love
##           another            anyone        appalachia            bright 
##              1.00              1.00              1.00              1.00 
##          brothers             cared         cherished            commit 
##              1.00              1.00              1.00              1.00 
##          complete         daughters           detroit              earn 
##              1.00              1.00              1.00              1.00 
##           efforts         engineers          enlisted          exercise 
##              1.00              1.00              1.00              1.00 
##          expelled            forced               gay             hills 
##              1.00              1.00              1.00              1.00 
##           hopeful        immigrants             lanes            living 
##              1.00              1.00              1.00              1.00 
##           mothers           newtown          pioneers             right 
##              1.00              1.00              1.00              1.00 
##           sisters           streets          striving          students 
##              1.00              1.00              1.00              1.00 
##           treated             truly              vote              wait 
##              1.00              1.00              1.00              1.00 
##           welcome             wives         workforce           journey 
##              1.00              1.00              1.00              0.93 
##             equal          applause              land             young 
##              0.83              0.74              0.70              0.70 
##             began            surely               see             quiet 
##              0.70              0.70              0.70              0.70 
##              safe              else           citizen              harm 
##              0.70              0.70              0.70              0.70 
## generation<U+0092>s 
##              0.70
# convert dtm into a matrix
dtm<-as.matrix(dtm)
#dtm[1:10,1:10]  
w <- sort(rowSums(dtm),decreasing=TRUE)
BO <- data.frame(word = names(w),freq=w)
head(BO, 10)
##              word freq
## will         will   39
## applause applause   25
## must         must   24
## can           can   20
## nation     nation   18
## people     people   18
## new           new   17
## time         time   16
## every       every   15
## america   america   14
rownames(BO)<-NULL
head(BO, 10)
##        word freq
## 1      will   39
## 2  applause   25
## 3      must   24
## 4       can   20
## 5    nation   18
## 6    people   18
## 7       new   17
## 8      time   16
## 9     every   15
## 10  america   14
dtm<-subset(w,w>=5)
v<-dtm
#Create a barplot of the top words as used by President Walker Bush in his inaugural - Version I
barplot(dtm,las=2,col=rainbow(100))

# Print the ten most frequent word used in the addresses
BO <- data.frame(word = names(v),freq=v)
head(BO, 10)
##              word freq
## will         will   39
## applause applause   25
## must         must   24
## can           can   20
## nation     nation   18
## people     people   18
## new           new   17
## time         time   16
## every       every   15
## america   america   14
rownames(BO)<-NULL
head(BO, 10)
##        word freq
## 1      will   39
## 2  applause   25
## 3      must   24
## 4       can   20
## 5    nation   18
## 6    people   18
## 7       new   17
## 8      time   16
## 9     every   15
## 10  america   14
#Create a barplot of the top words as used by President Walker Bush in his inaugural  - Version II
barplot(BO[1:50,]$freq, las = 2, names.arg = BO[1:50,]$word,
        col ="lightblue", main ="President Barack Obama's Most frequent inauguration words",
        ylab = "Word frequencies")

# Sentimental Analysis
s<-get_nrc_sentiment(BOtext)
#View the sentimental analysis of the first 6 paragraphs
head(s)
##   anger anticipation disgust fear joy sadness surprise trust negative positive
## 1     0            0       0    0   0       0        0     1        0        1
## 2     0            0       0    0   0       0        0     0        0        0
## 3     0            1       1    1   1       2        1     5        1        7
## 4     0            0       0    0   0       0        0     0        0        0
## 5     2            3       1    1   2       0        0     3        2        6
## 6     0            0       0    0   0       0        0     0        0        0
#View the third paragraph
BOtext[3]
## [1] "I stand here today humbled by the task before us, grateful for the trust you have bestowed, mindful of the sacrifices borne by our ancestors. I thank President Bush for his service to our nation, as well as the generosity and cooperation he has shown throughout this transition."
#Check the sentimental analysis of some words in the third paragraph
get_nrc_sentiment("grateful")
##   anger anticipation disgust fear joy sadness surprise trust negative positive
## 1     0            0       0    0   0       0        0     0        0        1
get_nrc_sentiment("generosity")
##   anger anticipation disgust fear joy sadness surprise trust negative positive
## 1     0            1       0    0   1       0        1     1        0        1
#View the third paragraph
BOtext[9]
## [1] "These are the indicators of crisis, subject to data and statistics. Less measurable but no less profound is a sapping of confidence across our land; a nagging fear that America's decline is inevitable, and that the next generation must lower its sights."
#Check the sentimental analysis of some words in the third paragraph
get_nrc_sentiment("lower")
##   anger anticipation disgust fear joy sadness surprise trust negative positive
## 1     0            0       0    0   0       1        0     0        1        0
get_nrc_sentiment("decline")
##   anger anticipation disgust fear joy sadness surprise trust negative positive
## 1     0            0       0    0   0       0        0     0        1        0
barplot(colSums(s),las=2,col=rainbow(10),
        ylab='count',main="President Barak Obama's.inaugural sentimental plot")

# Create a word cloud of President Barack Obama's inaugural speeches
# print(BO)
set.seed(1234)
library(wordcloud2)
wordcloud(words = BO$word, freq = BO$freq, min.freq = 2,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

# Get sentiments using the four different lexicons
syuzhet <- get_sentiment(BOtext, method="syuzhet")
bing <- get_sentiment(BOtext, method="bing")
afinn <- get_sentiment(BOtext, method="afinn")
nrc <- get_sentiment(BOtext, method="nrc")
sentiments <- data.frame(syuzhet, bing, afinn, nrc)
# get the emotions using the NRC dictionary
emotions <- get_nrc_sentiment(BOtext)
emo_bar = colSums(emotions)
emo_sum = data.frame(count=emo_bar, emotion=names(emo_bar))
emo_sum$emotion = factor(emo_sum$emotion, levels=emo_sum$emotion[order(emo_sum$count, decreasing = TRUE)])
# plot the different sentiments from different methods
plot_ly(sentiments, x=~emotions, y=~syuzhet, type="scatter", mode="jitter", name="syuzhet") %>%
  add_trace(y=~bing, mode="lines", name="bing") %>%
  add_trace(y=~afinn, mode="lines", name="afinn") %>%
  add_trace(y=~nrc, mode="lines", name="nrc") %>%
  layout(title="President Barack Obama's inaugural speech sentiment",
         yaxis=list(title="score"), xaxis=list(title="Emotions"))